# Replication file for ...
# Figures produced by this script (numbering follows the section headers below):
# Main paper Figure 2: Number of migration-related UK newspaper articles per day mentioning nationalism, racism, and xenophobia
# Appendix D Figure 1: Number of migration-related UK newspaper articles per day mentioning nationalism, racism, and xenophobia (same plot as main paper Figure 2)
# Appendix D Figure 2: Number of UK newspaper articles per day mentioning immigra* or migra* or refugee
# Appendix D Figure 3: Number of articles by newspaper title in the corpus
# Appendix D Figure 4: Number of articles with search terms: migra*, immigra*, OR refugee, AND Brexit OR referendum
# Appendix D Figure 5: Keywords with search terms: migra*, immigra*, OR refugee, AND Brexit OR referendum
# Appendix D Figure 6: Number of articles with search terms: migra*, immigra*, OR refugee, AND Brexit OR referendum OR UK OR EU
# Appendix D Figure 7: Keywords with search terms: migra*, immigra*, OR refugee, AND Brexit OR referendum OR UK OR EU

library(tm)
library(SnowballC)
library(wordcloud)
library(plyr)
library(dplyr)
library(tidytext)
library(tidyr)
library(stringr)
library(ggplot2)
library(scales)
library(widyr)
library(forcats)

# load data (columns used in this script: id, datetimestamp, title, heading, text)
brexit <- readRDS("brexit_final.rds")

# variable names
names(brexit)

# tidy tokenise brexit df -> text_df #
# unnest_tokens() does the preprocessing (lower case, punctuation removal)
# and splits `text` into one unigram per row, stored in column `word`
text_df <- unnest_tokens(brexit, word, text)

# stop words: the tidytext list plus a handful of corpus-specific tokens
data(stop_words)

nexis_stop_words <- c(
  "eip.telegraph.co.uk", "_fonts", "font", "format", "http", "s3", "url",
  "deck", "tg", "bst", "src", "assets", "austin"
)
nexis_stop_words_df <- data.frame(word = nexis_stop_words, lexicon = "custom")

custom_stop_words <- bind_rows(stop_words, nexis_stop_words_df)

# remove stop words; the join key is given explicitly so the join does not
# depend on (or report) whichever columns the two tables happen to share
text_df <- anti_join(text_df, custom_stop_words, by = "word")

# custom_stop_words is kept: later sections reuse it
rm(nexis_stop_words_df, nexis_stop_words, stop_words)

# count number of articles per day (each article id is counted once)
a_p_d <- text_df %>%
  distinct(id, .keep_all = TRUE) %>%
  count(datetimestamp) %>%
  rename(articles_per_day = n)

# attach the daily article total to every token row
text_df <- left_join(text_df, a_p_d, by = "datetimestamp")

rm(a_p_d)

#### MAIN PAPER Figure 2: Number of migration-related UK newspaper articles per day mentioning nationalism, racism, and xenophobia ####
#### APPENDIX D Figure 1: Number of migration-related UK newspaper articles per day mentioning nationalism, racism, and xenophobia ####
# NB This plot is in the main paper and replicated in the Appendix

# trace key words
# chosen for the paper == nationalism, xenophobia and racism
patterns_art <- c("^nationali", "^xenopho", "^racis")

# keep only the tokens that start with one of the stems
text_df_pat_art <- text_df %>%
  filter(grepl(paste(patterns_art, collapse = "|"), word))

# collapse matched tokens onto one label per facet
# (tokens starting "nationali" but not "nationalism"/"nationalist" keep their
# own spelling here and are dropped by the filter below)
text_df_pat_art$word[grepl("^nationalis(m|t)", text_df_pat_art$word)] <- "nationalis*"
text_df_pat_art$word[grepl("^racis", text_df_pat_art$word)] <- "racis*"
text_df_pat_art$word[grepl("^xenopho", text_df_pat_art$word)] <- "xenopho*"

# check of the labels before filtering
table(text_df_pat_art$word)

text_df_pat_art <- text_df_pat_art %>%
  filter(word %in% c("nationalis*", "xenopho*", "racis*"))

text_df_pat_art %>%
  # this is key: an article counts once per keyword, however often it uses it
  distinct(word, id, .keep_all = TRUE) %>%
  count(word, datetimestamp, articles_per_day) %>% # articles per day to be kept for plot
  arrange(datetimestamp) %>%
  ggplot(aes(x = datetimestamp, y = n, fill = word)) +
  # referendum day
  geom_vline(xintercept = as.POSIXct(as.Date("2016-06-23")), colour = "black") +
  geom_col(colour = "black", alpha = 0.25) +
  # smoothed trend with its confidence band; the former geom="line" argument
  # was not a geom_smooth() parameter and was ignored with a warning
  geom_smooth(span = 0.25, size = .5, colour = "black", show.legend = FALSE) +
  scale_x_datetime(name = NULL, breaks = date_breaks("7 days"), date_labels = "%b %d") +
  theme_bw() +
  facet_wrap(~word, ncol = 1) +
  labs(caption="Data cover the official 10 week campaign period from 15 April to the poll on 23 June 2016 and the 10 weeks after the referendum\nThe wordstems used means that nationalist, racist, xenophobic etc. are included") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none",
    plot.caption = element_text(colour = "grey50", size = 7)
  )
# saves the plot built just above (ggsave defaults to the last plot)
ggsave(filename = "plot_Brexit_articles_daily_mentions_pat1_ORIGINAL.pdf", width = 11.1, height = 5.1, units = "in")


#### APPENDIX D Figure 2: Number of UK newspaper articles per day mentioning immigra* or migra* or refugee ####

# daily number of distinct article ids in the full corpus
brexit %>%
  distinct(id, .keep_all = TRUE) %>%
  count(datetimestamp) %>%
  ggplot(aes(x = datetimestamp, y = n)) +
  geom_col(colour = "black", fill = "grey") +
  # smoothed trend; fill/alpha style its confidence band. The former
  # geom="line" argument was not a geom_smooth() parameter and was ignored
  # with a warning, so dropping it leaves the plot unchanged
  geom_smooth(span = 0.15, size = .5, colour = "black", fill = "red", alpha = 0.25, show.legend = FALSE) +
  # referendum day
  geom_vline(xintercept = as.POSIXct(as.Date("2016-06-23")), colour = "black") +
  scale_x_datetime(name = NULL, breaks = date_breaks("7 days"), date_labels = "%b %d") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# saves the plot built just above (ggsave defaults to the last plot)
ggsave(filename = "plot_Brexit_article_freq_ORIGINAL.pdf", width = 11.1, height = 5.1, units = "in")

# number for text
# descriptive statistics on corpus: mean, sd, max and min of articles per day
brexit %>%
  distinct(id, .keep_all = TRUE) %>%
  count(datetimestamp) %>%
  summarise(mean = mean(n), sd = sd(n), max = max(n), min = min(n))


#### APPENDIX D Figure 3: Number of articles by newspaper title in the corpus ####

# horizontal bar chart: one bar per newspaper title, ordered by row count
brexit %>%
  count(title) %>%
  mutate(title = reorder(title, n)) %>%
  ggplot(aes(x = title, y = n)) +
  geom_col(fill = "grey", colour = "black") +
  labs(x = NULL) +
  coord_flip() +
  theme_bw()
# saves the plot built just above (ggsave defaults to the last plot)
ggsave(
  filename = "plot_Brexit_article_title_freq.pdf",
  width = 11.1, height = 5.1, units = "in"
)

# numbers for text: row count per title and its share of the corpus in percent
brexit %>%
  group_by(title) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n) * 100)


#### APPENDIX D Figure 4: Number of articles with search terms: migra*, immigra*, OR refugee, AND Brexit OR referendum ####

# brexit1 filtered to ONLY include articles that have brexit or referendum in
# the heading or text
# NOTE(review): str_detect() is case-sensitive and matches substrings, so
# "Brexit" is only found if heading/text are already lower-cased in
# brexit_final.rds -- confirm against the data before changing the pattern
brexit1 <- brexit %>%
  filter(str_detect(heading, "brexit|referendum") |
           str_detect(text, "brexit|referendum"))

# daily number of distinct article ids in the brexit1 subset
brexit1 %>%
  distinct(id, .keep_all = TRUE) %>%
  count(datetimestamp) %>%
  ggplot(aes(x = datetimestamp, y = n)) +
  # referendum day
  geom_vline(xintercept = as.POSIXct(as.Date("2016-06-23")), colour = "grey") +
  geom_col(colour = "black", fill = "grey") +
  # smoothed trend; fill/alpha style its confidence band. The former
  # geom="line" argument was not a geom_smooth() parameter and was ignored
  # with a warning, so dropping it leaves the plot unchanged
  geom_smooth(span = 0.15, size = .5, colour = "black", fill = "red", alpha = 0.25, show.legend = FALSE) +
  scale_x_datetime(name = NULL, breaks = date_breaks("7 days"), date_labels = "%b %d") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# saves the plot built just above (ggsave defaults to the last plot)
ggsave(filename = "plot_Brexit_article_freq_BREXIT1.pdf", width = 11.1, height = 5.1, units = "in")

# numbers for the text: mean, sd, max and min of brexit1 articles per day
brexit1 %>%
  distinct(id, .keep_all = TRUE) %>%
  count(datetimestamp) %>%
  summarise(mean = mean(n), sd = sd(n), max = max(n), min = min(n))


#### APPENDIX D Figure 5: Keywords with search terms: migra*, immigra*, OR refugee, AND Brexit OR referendum ####

# preprocess, lower case and remove punctuation and tokenise
# (all done by unnest_tokens(); one unigram per row in column `word`)
text_df1 <- unnest_tokens(brexit1, word, text)

# remove stop words; the join key is given explicitly so the join does not
# depend on (or report) whichever columns the two tables happen to share
text_df1 <- anti_join(text_df1, custom_stop_words, by = "word")

# count number of articles per day (each article id is counted once)
a_p_d <- text_df1 %>%
  distinct(id, .keep_all = TRUE) %>%
  count(datetimestamp) %>%
  rename(articles_per_day = n)

# attach the daily article total to every token row
text_df1 <- left_join(text_df1, a_p_d, by = "datetimestamp")

rm(a_p_d)

# trace key words
# chosen for the paper == nationalism, xenophobia, and racism
patterns_art <- c("^nationali", "^xenopho", "^racis")

# keep only the tokens that start with one of the stems
text_df_pat_art1 <- text_df1 %>%
  filter(grepl(paste(patterns_art, collapse = "|"), word))

# collapse matched tokens onto one label per facet
# (tokens starting "nationali" but not "nationalism"/"nationalist" keep their
# own spelling here and are dropped by the filter below)
text_df_pat_art1$word[grepl("^nationalis(m|t)", text_df_pat_art1$word)] <- "nationalis*"
text_df_pat_art1$word[grepl("^racis", text_df_pat_art1$word)] <- "racis*"
text_df_pat_art1$word[grepl("^xenopho", text_df_pat_art1$word)] <- "xenopho*"

# check of the labels before filtering
table(text_df_pat_art1$word)

text_df_pat_art1 <- text_df_pat_art1 %>%
  filter(word %in% c("nationalis*", "xenopho*", "racis*"))

text_df_pat_art1 %>%
  # this is key: an article counts once per keyword, however often it uses it
  distinct(word, id, .keep_all = TRUE) %>%
  count(word, datetimestamp, articles_per_day) %>% # articles per day to be kept for plot
  arrange(datetimestamp) %>%
  ggplot(aes(x = datetimestamp, y = n, fill = word)) +
  # referendum day
  geom_vline(xintercept = as.POSIXct(as.Date("2016-06-23")), colour = "grey") +
  geom_col(colour = "black", alpha = 0.25) +
  # smoothed trend with its confidence band; the former geom="line" argument
  # was not a geom_smooth() parameter and was ignored with a warning
  geom_smooth(span = 0.25, size = .5, colour = "black", show.legend = FALSE) +
  scale_x_datetime(name = NULL, breaks = date_breaks("7 days"), date_labels = "%b %d") +
  theme_bw() +
  facet_wrap(~word, ncol = 1) +
  labs(caption="Data cover the official 10 week campaign period from 15 April to the poll on 23 June 2016 and the 10 weeks after the referendum\nThe wordstems used means that nationalist, racist, xenophobic etc. are included") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none",
    plot.caption = element_text(colour = "grey50", size = 7)
  )
# saves the plot built just above (ggsave defaults to the last plot)
ggsave(filename = "plot_Brexit_articles_daily_mentions_pat1_BREXIT1.pdf", width = 11.1, height = 5.1, units = "in")


#### APPENDIX D Figure 6: Number of articles with search terms: migra*, immigra*, OR refugee, AND Brexit OR referendum OR UK OR EU ####

# brexit2 filtered to ONLY include articles that have brexit or referendum or
# eu or uk in the heading or text
# NOTE(review): str_detect() is case-sensitive and matches substrings, so
# "eu" / "uk" also match inside longer words (e.g. "neutral", "ukip") and
# upper-case "EU" / "UK" are only found if heading/text are already
# lower-cased in brexit_final.rds -- confirm against the data before changing
# the pattern
brexit2 <- brexit %>%
  filter(str_detect(heading, "brexit|referendum|eu|uk") |
           str_detect(text, "brexit|referendum|eu|uk"))

# Frequency of articles: daily number of distinct article ids in brexit2
brexit2 %>%
  distinct(id, .keep_all = TRUE) %>%
  count(datetimestamp) %>%
  ggplot(aes(x = datetimestamp, y = n)) +
  # referendum day
  geom_vline(xintercept = as.POSIXct(as.Date("2016-06-23")), colour = "grey") +
  geom_col(colour = "black", fill = "grey") +
  # smoothed trend; fill/alpha style its confidence band. The former
  # geom="line" argument was not a geom_smooth() parameter and was ignored
  # with a warning, so dropping it leaves the plot unchanged
  geom_smooth(span = 0.15, size = .5, colour = "black", fill = "red", alpha = 0.25, show.legend = FALSE) +
  scale_x_datetime(name = NULL, breaks = date_breaks("7 days"), date_labels = "%b %d") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# saves the plot built just above (ggsave defaults to the last plot)
ggsave(filename = "plot_Brexit_article_freq_BREXIT2.pdf", width = 11.1, height = 5.1, units = "in")

# numbers for text: mean, sd, max and min of brexit2 articles per day
# (computed on brexit2, the subset plotted above; this previously used the
# full `brexit` corpus and so repeated the Appendix D Figure 2 statistics)
brexit2 %>%
  distinct(id, .keep_all = TRUE) %>%
  count(datetimestamp) %>%
  summarise(mean = mean(n), sd = sd(n), max = max(n), min = min(n))


#### APPENDIX D Figure 7: Keywords with search terms: migra*, immigra*, OR refugee, AND Brexit OR referendum OR UK OR EU ####

# preprocess, lower case and remove punctuation and tokenise ... unigrams here
# (all done by unnest_tokens(); one unigram per row in column `word`)
text_df2 <- unnest_tokens(brexit2, word, text)

# remove stop words; the join key is given explicitly so the join does not
# depend on (or report) whichever columns the two tables happen to share
text_df2 <- anti_join(text_df2, custom_stop_words, by = "word")

# count number of articles per day (each article id is counted once)
a_p_d <- text_df2 %>%
  distinct(id, .keep_all = TRUE) %>%
  count(datetimestamp) %>%
  rename(articles_per_day = n)

# attach the daily article total to every token row
text_df2 <- left_join(text_df2, a_p_d, by = "datetimestamp")

rm(a_p_d)

# trace key words
# chosen for the paper == nationalism, xenophobia, and racism
patterns_art <- c("^nationali", "^xenopho", "^racis")

# keep only the tokens that start with one of the stems
text_df_pat_art2 <- text_df2 %>%
  filter(grepl(paste(patterns_art, collapse = "|"), word))

# collapse matched tokens onto one label per facet
# (tokens starting "nationali" but not "nationalism"/"nationalist" keep their
# own spelling here and are dropped by the filter below)
text_df_pat_art2$word[grepl("^nationalis(m|t)", text_df_pat_art2$word)] <- "nationalis*"
text_df_pat_art2$word[grepl("^racis", text_df_pat_art2$word)] <- "racis*"
text_df_pat_art2$word[grepl("^xenopho", text_df_pat_art2$word)] <- "xenopho*"

# check of the labels before filtering
table(text_df_pat_art2$word)

text_df_pat_art2 <- text_df_pat_art2 %>%
  filter(word %in% c("nationalis*", "xenopho*", "racis*"))

text_df_pat_art2 %>%
  # this is key: an article counts once per keyword, however often it uses it
  distinct(word, id, .keep_all = TRUE) %>%
  count(word, datetimestamp, articles_per_day) %>% # articles per day to be kept for plot
  arrange(datetimestamp) %>%
  ggplot(aes(x = datetimestamp, y = n, fill = word)) +
  # referendum day
  geom_vline(xintercept = as.POSIXct(as.Date("2016-06-23")), colour = "grey") +
  geom_col(colour = "black", alpha = 0.25) +
  # smoothed trend with its confidence band; the former geom="line" argument
  # was not a geom_smooth() parameter and was ignored with a warning
  geom_smooth(span = 0.25, size = .5, colour = "black", show.legend = FALSE) +
  scale_x_datetime(name = NULL, breaks = date_breaks("7 days"), date_labels = "%b %d") +
  theme_bw() +
  facet_wrap(~word, ncol = 1) +
  labs(caption="Data cover the official 10 week campaign period from 15 April to the poll on 23 June 2016 and the 10 weeks after the referendum\nThe wordstems used means that nationalist, racist, xenophobic etc. are included") +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    legend.position = "none",
    plot.caption = element_text(colour = "grey50", size = 7)
  )
# saves the plot built just above (ggsave defaults to the last plot)
ggsave(filename = "plot_Brexit_articles_daily_mentions_pat1_BREXIT2.pdf", width = 11.1, height = 5.1, units = "in")


